/* 

Name : prepare_data_decile.do 
Authors: Martin, Méjean, Parenti
Last update: January 10, 2019 
Aim : construct a dataset with info on the length of transactions, by product, destination and size decile 


*/ 

cd $datapath

global firstyear 1996
global lastyear 2006
global time_constraint  //"drop if firstyear_panel!=firstyear_sample" in a robustness, we add a constraint to deal with left censoring 
global time_constraint2 "drop if firstyear_sample>2004" // drop right-censored seller-buyer relationships
global dim_decile $product 
global variant _baseline

use $dataset, clear
sort siren year month
by siren: g _=1 if _n==1
by siren: g __=1 if _n==_N
bys siren year month: egen firstdate_siren=mean(_)
bys siren year month: egen lastdate_siren=mean(__)
drop _ __

*** Drop buyers with a single transaction
bys buyer iso2 $product: egen _=count(export)
replace _=0 if _!=1
drop if _==1
drop _

*** we merge with French PPI to have exports in 2010 euros 

sort year month 
merge m:1 year month using $outsidedata2/ppi_france // monthly PPI data from INSEE 
keep if _m==3 
drop _m 
rename export export_n 
g export = export_n/ppi*100
label var export "real exports (deflated by PPI (2010=100))"

global dataset transac
save new_$dataset, replace

*** here we compute the length of transactions
construct_base_4decile new_$dataset base_4deciletransac$firstyear$lastyear "$time_constraint" "$time_constraint2" nomulti "$countrysample" // "$countrysample"
// construct_base4decile is the name of the program, 
// new_$dataset$firstyear$lastyear is the name of the input database, 
// base_4decile$firstyear$lastyear is the name of the output, 
// the parts under " " are the restrictions we impose: 
// $time_constraint treats right-censoring
// $time_constraint2 treats left-censoring
// this part is a bit boring => need to identify continuous relationships and compute their duration 
// note that we also compute switching probabilities (we use it in a discussion in the paper) 

*** build our group variables on the final sample 
use base_4deciletransac$firstyear$lastyear, clear
sort iso2 id_bp id_bs id_bs_continuous
id_group _ "iso2 $product id_bp" // id_group does the same job as egen group (but work if we have millions of groups)
drop id_bp
rename _ id_bp
id_group _ "iso2 $product id_bs"
drop id_bs
rename _ id_bs
id_group _ "iso2 $product id_bs_continuous"
drop id_bs_continuous
rename _ id_bs_continuous
save base_4deciletransac$firstyear$lastyear, replace



*** compute size decile 
// this is the time-consuming part. computing deciles for such a big dataset takes hours

use base_4deciletransac$firstyear$lastyear, clear
egen p=group($product)
drop if export==0
pwcorr num_transac       length_bps
/*
.   pwcorr num_transac       length_bps

             | num_tr~c length~s
-------------+------------------
 num_transac |   1.0000 
  length_bps |   0.7022   1.0000 
*/
g dur=num_transac // NEW number of transac rather than number of months 
g size=export  
collapse (mean) size dur  , by(id_bs_continuous $product iso2) 
g lsize=log(size)
egen _=mean(lsize), by($product)
g lsize_= size- _ 
egen min=pctile(lsize_), by($dim_decile)  p(1) 
egen max=pctile(lsize_), by($dim_decile)  p(99) 
egen decile=xtile(lsize_) , by($dim_decile) nq(10) // by product or product iso2? => makes more sense by product only 
drop if lsize_<min | lsize_>max // we truncate the first and tenth deciles to be fit with the theoretical framework (see appendix) 
save base_4deciletransac$firstyear$lastyear$variant, replace

use corres_nc8$firstyear$lastyear, clear
keep if year==2002
rename nc8 nc8_2002
keep id_conc nc8
sort id_conc
save temp, replace
	
use base_4deciletransac$firstyear$lastyear$variant, clear 
sort id_conc
joinby id_conc using temp

tostring nc8_2002, replace 
replace nc8_2002="0"+ nc8_2002 if length(nc8_2002)==7 
g hs6_2002=substr(nc8_2002,1,6) 
egen hs=group(hs6_2002)
save base_4regtransac, replace 

cd $datapath
estimate_bydecile base_4regtransac dur transac $product$firstyear$lastyear

use rs_baseline_id_conc$firstyear$lastyear, clear 
duplicates drop id_conc, force 
save tmp, replace 

use rstransac_id_conc19962006.dta, clear 
rename rs_hs6_b transac_bhs
rename rs_hs6_se transac_sehs 
keep transac* id_conc 
duplicates drop id_conc, force 
merge 1:1 id_conc using tmp
collapse (mean) rs_hs6_b transac_b rs_hs6_se, by(hs6_2002)
pwcorr rs_hs6_b transac_b
pwcorr rs_hs6_b transac_b [w=1/rs_hs6_se]

	local j=20
foreach i in transac_b {
putexcel B`j' = "`i'"	
corr rs_hs6_b `i'	[aw=1/rs_hs6_se]
local rho = r(rho)
putexcel C`j' =`rho'
tabstat `i', s(iqr n) save
mat total = r(StatTotal) 
return list
local stat1=total[1,1] 
putexcel D`j'=`stat1'
local stat2=total[2,1]  
putexcel E`j'=`stat2'
local j = `j'+1
}